In [1]:
import pandas as pd
df = pd.read_csv('Reviews.csv')
import numpy as np
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
# Product Scores
fig = px.histogram(df, x="Score")
fig.update_traces(marker_color="turquoise",marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Product Score')
fig.show()
In [3]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
chachedWords = stopwords.words('english')
from wordcloud import WordCloud 

# Create stopword list:
stopwords = set(stopwords.words('english'))
stopwords.update(["br", "href"])
textt = " ".join(review for review in df.Text)
wordcloud = WordCloud(stopwords=stopwords).generate(textt)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud11.png')
plt.show()
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Humberto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [4]:
df['sentiment'] = df['Score'].apply(lambda rating : +1 if rating > 3 else -1)
In [5]:
df
Out[5]:
Id ProductId UserId ProfileName HelpfulnessNumerator HelpfulnessDenominator Score Time Summary Text sentiment
0 1 B001E4KFG0 A3SGXH7AUHU8GW delmartian 1 1 5 1303862400 Good Quality Dog Food I have bought several of the Vitality canned d... 1
1 2 B00813GRG4 A1D87F6ZCVE5NK dll pa 0 0 1 1346976000 Not as Advertised Product arrived labeled as Jumbo Salted Peanut... -1
2 3 B000LQOCH0 ABXLMWJIXXAIN Natalia Corres "Natalia Corres" 1 1 4 1219017600 "Delight" says it all This is a confection that has been around a fe... 1
3 4 B000UA0QIQ A395BORC6FGVXV Karl 3 3 2 1307923200 Cough Medicine If you are looking for the secret ingredient i... -1
4 5 B006K2ZZ7K A1UQRSCLF8GW1T Michael D. Bigham "M. Wassir" 0 0 5 1350777600 Great taffy Great taffy at a great price. There was a wid... 1
... ... ... ... ... ... ... ... ... ... ... ...
568449 568450 B001EO7N10 A28KG5XORO54AY Lettie D. Carter 0 0 5 1299628800 Will not do without Great for sesame chicken..this is a good if no... 1
568450 568451 B003S1WTCU A3I8AFVPEE8KI5 R. Sawyer 0 0 2 1331251200 disappointed I'm disappointed with the flavor. The chocolat... -1
568451 568452 B004I613EE A121AA1GQV751Z pksd "pk_007" 2 2 5 1329782400 Perfect for our maltipoo These stars are small, so you can give 10-15 o... 1
568452 568453 B004I613EE A3IBEVCTXKNOH Kathy A. Welch "katwel" 1 1 5 1331596800 Favorite Training and reward treat These are the BEST treats for training and rew... 1
568453 568454 B001LR2CU2 A3LGQPJCZVL9UC srfell17 0 0 5 1338422400 Great Honey I am very satisfied ,product is as advertised,... 1

568454 rows × 11 columns

In [6]:
positive = df[df['sentiment'] == 1]
negative = df[df['sentiment'] == -1]
In [7]:
stopwords = set(stopwords)
stopwords.update(["br", "href","good","great"])
pos = " ".join(review for review in positive.Summary)
wordcloud2 = WordCloud(stopwords=stopwords).generate(pos)
plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis("off")
plt.show()
In [16]:
negative['Summary'] = negative['Summary'].apply(str)
<ipython-input-16-d41b350b8f8f>:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [17]:
neg = " ".join(review for review in negative.Summary)
wordcloud3 = WordCloud(stopwords=stopwords).generate(neg)

plt.imshow(wordcloud3, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud33.png')
plt.show()
In [18]:
df['sentimentt'] = df['sentiment'].replace({-1 : 'negative'})
df['sentimentt'] = df['sentimentt'].replace({1 : 'positive'})
fig = px.histogram(df, x="sentimentt")
fig.update_traces(marker_color="indianred",marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Product Sentiment')
fig.show()
In [22]:
def remove_punctuation(text):
    final = "".join(u for u in text if u not in ("?", ".", ";", ":", "!",'"'))
    return final
df['Text'] = df['Text'].apply(remove_punctuation)
df = df.dropna(subset=['Summary'])
df['Summary'] = df['Summary'].apply(remove_punctuation)
<ipython-input-22-903d5d021ed6>:4: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [26]:
dfNew = df[['Summary','sentiment']]
dfNew
Out[26]:
Summary sentiment
0 Good Quality Dog Food 1
1 Not as Advertised -1
2 Delight says it all 1
3 Cough Medicine -1
4 Great taffy 1
... ... ...
568449 Will not do without 1
568450 disappointed -1
568451 Perfect for our maltipoo 1
568452 Favorite Training and reward treat 1
568453 Great Honey 1

568427 rows × 2 columns

In [27]:
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]
In [31]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Summary'].values.astype('U'))
test_matrix = vectorizer.transform(test['Summary'].values.astype('U'))
In [32]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
In [33]:
X_train = train_matrix
X_test = test_matrix
In [35]:
train_matrix
Out[35]:
<447831x33008 sparse matrix of type '<class 'numpy.int64'>'
	with 1836177 stored elements in Compressed Sparse Row format>
In [36]:
y_train = train['sentiment']
In [37]:
y_test = test['sentiment']
In [38]:
lr.fit(X_train,y_train)
C:\Users\Humberto\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Out[38]:
LogisticRegression()
In [39]:
predictions = lr.predict(X_test)
In [40]:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predictions,y_test)
Out[40]:
array([[17307,  3918],
       [ 9151, 90220]], dtype=int64)
In [41]:
print(classification_report(predictions,y_test))
              precision    recall  f1-score   support

          -1       0.65      0.82      0.73     21225
           1       0.96      0.91      0.93     99371

    accuracy                           0.89    120596
   macro avg       0.81      0.86      0.83    120596
weighted avg       0.90      0.89      0.90    120596

In [ ]: